#didn’t like output with the flex_board screen formating so I commented out and made it html. I left it in the code so you saw that I did attempt it.

Load library

library(naniar)
library(flextable)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::compose() masks flextable::compose()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(ggplot2)
library(dplyr)
library(knitr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## The following objects are masked from 'package:flextable':
## 
##     as_image, footnote
library(ggbeeswarm)
library(skimr)
## 
## Attaching package: 'skimr'
## 
## The following object is masked from 'package:naniar':
## 
##     n_complete
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following objects are masked from 'package:flextable':
## 
##     highlight, style
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(DT)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(boot)

#set theme to bw
theme_set(theme_bw())

Data import

# Import data 
census_data <- read.csv("C:/Users/spsky/OneDrive/UoM/R/Final Project/census_data_county_2009-2021.cvs")

# first few rows of the data
head(census_data)
##   geoid            county_state year population median_income
## 1  1001 Autauga County, Alabama 2009      49584         51463
## 2  1003 Baldwin County, Alabama 2009     171997         48918
## 3  1005 Barbour County, Alabama 2009      29663         32537
## 4  1007    Bibb County, Alabama 2009      21464         41236
## 5  1009  Blount County, Alabama 2009      56804         45406
## 6  1011 Bullock County, Alabama 2009      10917         26209
##   median_monthly_rent_cost median_monthly_home_cost prop_female prop_male
## 1                      779                      854   0.5148233 0.4851767
## 2                      788                      846   0.5100903 0.4899097
## 3                      499                      535   0.4711594 0.5288406
## 4                      506                      466   0.4798733 0.5201267
## 5                      541                      653   0.5032744 0.4967256
## 6                      363                      477   0.4352844 0.5647156
##   prop_poverty
## 1    0.1030618
## 2    0.1192494
## 3    0.2474965
## 4    0.1189653
## 5    0.1095393
## 6    0.3227418

Exploratory Data Analysis (EDA)

# Data skim
skim(census_data)
Data summary
Name census_data
Number of rows 41867
Number of columns 10
_______________________
Column type frequency:
character 1
numeric 9
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
county_state 0 1 16 42 0 3228 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
geoid 0 1 31394.62 16291.12 1001.00 19032.00 30025.00 46107.00 72153.00 ▅▇▆▆▁
year 0 1 2015.00 3.74 2009.00 2012.00 2015.00 2018.00 2021.00 ▇▅▇▅▇
population 0 1 99198.02 318133.15 41.00 11218.00 25987.00 66375.50 10105722.00 ▇▁▁▁▁
median_income 4 1 48034.94 14258.33 10499.00 39167.00 46215.00 54732.00 156821.00 ▃▇▁▁▁
median_monthly_rent_cost 30 1 703.80 214.98 99.00 567.00 658.00 784.00 2599.00 ▅▇▁▁▁
median_monthly_home_cost 18 1 775.03 356.61 103.00 543.00 684.00 910.00 3254.00 ▇▅▁▁▁
prop_female 0 1 0.50 0.02 0.19 0.49 0.50 0.51 0.63 ▁▁▁▇▁
prop_male 0 1 0.50 0.02 0.37 0.49 0.50 0.51 0.81 ▁▇▁▁▁
prop_poverty 1 1 0.17 0.08 0.00 0.11 0.15 0.20 0.67 ▆▇▁▁▁
# Display data structure
str(census_data)
## 'data.frame':    41867 obs. of  10 variables:
##  $ geoid                   : int  1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
##  $ county_state            : chr  "Autauga County, Alabama" "Baldwin County, Alabama" "Barbour County, Alabama" "Bibb County, Alabama" ...
##  $ year                    : int  2009 2009 2009 2009 2009 2009 2009 2009 2009 2009 ...
##  $ population              : int  49584 171997 29663 21464 56804 10917 20189 112969 34704 24427 ...
##  $ median_income           : int  51463 48918 32537 41236 45406 26209 31938 38382 32433 40240 ...
##  $ median_monthly_rent_cost: int  779 788 499 506 541 363 453 571 552 450 ...
##  $ median_monthly_home_cost: int  854 846 535 466 653 477 528 628 556 484 ...
##  $ prop_female             : num  0.515 0.51 0.471 0.48 0.503 ...
##  $ prop_male               : num  0.485 0.49 0.529 0.52 0.497 ...
##  $ prop_poverty            : num  0.103 0.119 0.247 0.119 0.11 ...
# Summary of important variables
summary(census_data)
##      geoid       county_state            year        population      
##  Min.   : 1001   Length:41867       Min.   :2009   Min.   :      41  
##  1st Qu.:19032   Class :character   1st Qu.:2012   1st Qu.:   11218  
##  Median :30025   Mode  :character   Median :2015   Median :   25987  
##  Mean   :31395                      Mean   :2015   Mean   :   99198  
##  3rd Qu.:46107                      3rd Qu.:2018   3rd Qu.:   66376  
##  Max.   :72153                      Max.   :2021   Max.   :10105722  
##                                                                      
##  median_income    median_monthly_rent_cost median_monthly_home_cost
##  Min.   : 10499   Min.   :  99.0           Min.   : 103            
##  1st Qu.: 39167   1st Qu.: 567.0           1st Qu.: 543            
##  Median : 46215   Median : 658.0           Median : 684            
##  Mean   : 48035   Mean   : 703.8           Mean   : 775            
##  3rd Qu.: 54732   3rd Qu.: 784.0           3rd Qu.: 910            
##  Max.   :156821   Max.   :2599.0           Max.   :3254            
##  NA's   :4        NA's   :30               NA's   :18              
##   prop_female       prop_male       prop_poverty   
##  Min.   :0.1917   Min.   :0.3736   Min.   :0.0000  
##  1st Qu.:0.4950   1st Qu.:0.4882   1st Qu.:0.1129  
##  Median :0.5044   Median :0.4956   Median :0.1515  
##  Mean   :0.5000   Mean   :0.5000   Mean   :0.1659  
##  3rd Qu.:0.5118   3rd Qu.:0.5050   3rd Qu.:0.1973  
##  Max.   :0.6264   Max.   :0.8083   Max.   :0.6707  
##                                    NA's   :1
# Missing values
gg_miss_var(census_data)

String Manipulation

# Manipulate strings in the dataset and append new columns with the changes
census_data <- census_data |>
  mutate(
    new_county_state = str_to_title(county_state),
    new_county_state = str_replace_all(new_county_state, " County", ""),
    year = str_trim(as.character(year))
  )

# Display the first few rows of the updated dataset
head(census_data)
##   geoid            county_state year population median_income
## 1  1001 Autauga County, Alabama 2009      49584         51463
## 2  1003 Baldwin County, Alabama 2009     171997         48918
## 3  1005 Barbour County, Alabama 2009      29663         32537
## 4  1007    Bibb County, Alabama 2009      21464         41236
## 5  1009  Blount County, Alabama 2009      56804         45406
## 6  1011 Bullock County, Alabama 2009      10917         26209
##   median_monthly_rent_cost median_monthly_home_cost prop_female prop_male
## 1                      779                      854   0.5148233 0.4851767
## 2                      788                      846   0.5100903 0.4899097
## 3                      499                      535   0.4711594 0.5288406
## 4                      506                      466   0.4798733 0.5201267
## 5                      541                      653   0.5032744 0.4967256
## 6                      363                      477   0.4352844 0.5647156
##   prop_poverty new_county_state
## 1    0.1030618 Autauga, Alabama
## 2    0.1192494 Baldwin, Alabama
## 3    0.2474965 Barbour, Alabama
## 4    0.1189653    Bibb, Alabama
## 5    0.1095393  Blount, Alabama
## 6    0.3227418 Bullock, Alabama

Data Cleaning

census_data_sample <- census_data|>
  filter(!is.na(median_income))|>
  select(new_county_state, year, population, median_income, prop_female, prop_male, prop_poverty, median_monthly_rent_cost, median_monthly_home_cost)

#cleaned and sampled data
head(census_data_sample)
##   new_county_state year population median_income prop_female prop_male
## 1 Autauga, Alabama 2009      49584         51463   0.5148233 0.4851767
## 2 Baldwin, Alabama 2009     171997         48918   0.5100903 0.4899097
## 3 Barbour, Alabama 2009      29663         32537   0.4711594 0.5288406
## 4    Bibb, Alabama 2009      21464         41236   0.4798733 0.5201267
## 5  Blount, Alabama 2009      56804         45406   0.5032744 0.4967256
## 6 Bullock, Alabama 2009      10917         26209   0.4352844 0.5647156
##   prop_poverty median_monthly_rent_cost median_monthly_home_cost
## 1    0.1030618                      779                      854
## 2    0.1192494                      788                      846
## 3    0.2474965                      499                      535
## 4    0.1189653                      506                      466
## 5    0.1095393                      541                      653
## 6    0.3227418                      363                      477

Summary statistics Create a Sample of the Data

# Sample of around 500 data observations
census_data_sample <- census_data |> sample_frac(0.012)

# Check the sample size
nrow(census_data_sample)
## [1] 502

Group-level Summary Statistics

# Summary statistics by new_county_state
state_summary <- census_data |>
  group_by(new_county_state) |>
  summarize(
    avg_income = round(mean(median_income, na.rm = TRUE), 0),
    avg_population = round(mean(population, na.rm = TRUE), 0)
  ) |>
  rename(
    `County, State` = new_county_state,
    `Income` = avg_income,
    `Population` = avg_population
  ) |>
  arrange(`Income`)

# Convert income and population formats
state_summary <- state_summary |>
  mutate(
    Income = dollar(Income),
    Population = comma(Population)
  )

# Display the first 50 rows of the summary table
knitr::kable(head(state_summary, 50), caption = "Average County-level Income Statistics (Lowest 50)")
Average County-level Income Statistics (Lowest 50)
County, State Income Population
Adjuntas Municipio, Puerto Rico $12,481 18,731
Guánica Municipio, Puerto Rico $12,905 18,152
Comerío Municipio, Puerto Rico $13,230 19,999
Maricao Municipio, Puerto Rico $13,334 6,206
Lares Municipio, Puerto Rico $13,500 29,208
Lajas Municipio, Puerto Rico $14,048 24,677
San Sebastián Municipio, Puerto Rico $14,262 40,650
Orocovis Municipio, Puerto Rico $14,427 22,519
Las Marías Municipio, Puerto Rico $14,440 9,431
Moca Municipio, Puerto Rico $14,455 38,830
Ciales Municipio, Puerto Rico $14,604 17,970
Mayagüez Municipio, Puerto Rico $14,639 83,133
Yauco Municipio, Puerto Rico $14,804 39,533
Patillas Municipio, Puerto Rico $15,018 18,261
Utuado Municipio, Puerto Rico $15,094 31,281
Salinas Municipio, Puerto Rico $15,309 29,734
Quebradillas Municipio, Puerto Rico $15,410 25,097
Isabela Municipio, Puerto Rico $15,426 44,214
San Germán Municipio, Puerto Rico $15,481 33,967
Corozal Municipio, Puerto Rico $15,493 35,795
Guayanilla Municipio, Puerto Rico $15,722 20,362
Barranquitas Municipio, Puerto Rico $15,831 29,562
Aguada Municipio, Puerto Rico $15,875 40,533
Arroyo Municipio, Puerto Rico $15,895 18,642
Aguadilla Municipio, Puerto Rico $15,978 58,050
Peñuelas Municipio, Puerto Rico $16,077 22,958
Sabana Grande Municipio, Puerto Rico $16,243 24,302
Vieques Municipio, Puerto Rico $16,261 9,000
Jayuya Municipio, Puerto Rico $16,285 15,879
Barceloneta Municipio, Puerto Rico $16,308 24,309
Aguas Buenas Municipio, Puerto Rico $16,407 27,516
Yabucoa Municipio, Puerto Rico $16,627 35,973
Cabo Rojo Municipio, Puerto Rico $16,646 49,976
Guayama Municipio, Puerto Rico $16,889 43,077
Ponce Municipio, Puerto Rico $17,009 155,669
Morovis Municipio, Puerto Rico $17,169 31,797
Naranjito Municipio, Puerto Rico $17,221 29,495
Arecibo Municipio, Puerto Rico $17,435 92,297
Camuy Municipio, Puerto Rico $17,500 34,032
Florida Municipio, Puerto Rico $17,521 12,505
Santa Isabel Municipio, Puerto Rico $17,739 22,514
Coamo Municipio, Puerto Rico $17,744 39,545
Añasco Municipio, Puerto Rico $17,767 28,250
Aibonito Municipio, Puerto Rico $17,818 24,827
Naguabo Municipio, Puerto Rico $17,846 26,138
Villalba Municipio, Puerto Rico $17,924 24,759
Vega Baja Municipio, Puerto Rico $17,929 56,910
San Lorenzo Municipio, Puerto Rico $17,981 39,697
Manatí Municipio, Puerto Rico $18,082 42,258
Hatillo Municipio, Puerto Rico $18,123 41,137

Frequency Table of Two Categorical Variables

# Frequency table of county_state and year with average median_monthly_rent_cost
state_year_rent_table <- census_data_sample |>
  group_by(new_county_state, year) |>
  summarize(
    avg_median_rent = round(mean(median_monthly_rent_cost, na.rm = TRUE), 0)
  ) |>
  ungroup() |>
  select(`County, State` = new_county_state, Year = year, `Rent` = avg_median_rent) |>
  arrange(desc(Rent)) |>
  mutate(Rent = dollar(Rent))
## `summarise()` has grouped output by 'new_county_state'. You can override using
## the `.groups` argument.
# Display the first 50 rows of the frequency table
knitr::kable(head(state_year_rent_table, 50), caption = "Frequency Table of County, Year and Average Median Rent \n(Top 50)")
Frequency Table of County, Year and Average Median Rent (Top 50)
County, State Year Rent
Arlington, Virginia 2020 $2,005
San Francisco, California 2017 $1,709
Charles, Maryland 2018 $1,661
Queens, New York 2017 $1,456
Sacramento, California 2021 $1,434
Placer, California 2016 $1,339
Ouray, Colorado 2020 $1,305
Cherokee, Georgia 2020 $1,304
Routt, Colorado 2019 $1,282
Broward, Florida 2017 $1,271
Northwest Arctic Borough, Alaska 2021 $1,266
Los Angeles, California 2016 $1,264
Williamson, Texas 2018 $1,248
Hays, Texas 2021 $1,235
Manassas City, Virginia 2010 $1,232
Fulton, Georgia 2019 $1,205
Morris, New Jersey 2009 $1,199
Rockwall, Texas 2009 $1,183
Davidson, Tennessee 2020 $1,172
Midland, Texas 2016 $1,148
Rockwall, Texas 2011 $1,142
Essex, Massachusetts 2017 $1,135
Palm Beach, Florida 2010 $1,129
Ramsey, Minnesota 2021 $1,120
St. Johns, Florida 2015 $1,119
Union, New Jersey 2011 $1,119
Portsmouth City, Virginia 2021 $1,116
Will, Illinois 2017 $1,112
Kent, Rhode Island 2020 $1,110
Delaware, Pennsylvania 2020 $1,109
Goochland, Virginia 2017 $1,096
Williamsburg City, Virginia 2015 $1,093
Chesapeake City, Virginia 2011 $1,090
Chesterfield, Virginia 2013 $1,077
Hamilton, Indiana 2016 $1,071
James City, Virginia 2010 $1,066
Essex, Massachusetts 2014 $1,063
Anchorage Municipality, Alaska 2011 $1,058
Dare, North Carolina 2017 $1,056
Canadian, Oklahoma 2021 $1,054
Chesterfield, Virginia 2012 $1,052
Baltimore City, Maryland 2018 $1,051
Hillsborough, Florida 2017 $1,040
Belknap, New Hampshire 2020 $1,039
Warren, Virginia 2019 $1,020
Sarasota, Florida 2014 $1,008
St. Tammany Parish, Louisiana 2016 $998
Pinal, Arizona 2016 $990
Jefferson, New York 2019 $987
Hanover, Virginia 2010 $977

Data Visualizations Visualization 1: Lowest Average Population

# Filter for the 10 counties with the lowest average populations over the years
lowest_10_population_counties <- census_data_sample |>
  group_by(new_county_state) |>
  summarize(avg_population = mean(population, na.rm = TRUE)) |>
  top_n(-10, avg_population)

# Filter the original data to include only these counties
lowest_10_population_data <- census_data_sample |>
  filter(new_county_state %in% lowest_10_population_counties$new_county_state)

# Summarize the average population for each county
lowest_10_population_summary <- lowest_10_population_data |>
  group_by(new_county_state) |>
  summarize(avg_population = mean(population, na.rm = TRUE))

# Create a horizontal bar chart
ggplot(lowest_10_population_summary, aes(x = reorder(new_county_state, avg_population), y = avg_population)) +
  geom_bar(stat = "identity", fill = "lightblue", color = "black") +
  coord_flip() +
  labs(title = "10 Counties with the \nLowest Average Population", x = "County", y = "Population",
       caption = "Source: Census Data Sample") +
  scale_y_continuous(expand = expansion(mult = c(0, 0.1)))+
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
  )

Visualization 2: Median Income Distribution

# Filter for the 10 counties with the highest median incomes
top_10_income_counties <- census_data |>
  group_by(new_county_state) |>
  summarize(median_income = mean(median_income, na.rm = TRUE)) |>
  top_n(10, median_income)

# Filter the original data to include only these counties
top_10_income_data <- census_data |>
  filter(new_county_state %in% top_10_income_counties$new_county_state)

# Median income distribution by top 10 counties using census data
ggplot(top_10_income_data, aes(x = new_county_state, y = round(median_income, 0))) +
  geom_boxplot(fill = "yellow", color = "black") +
  scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "", big.mark = ","), limits = c(80000, 160000), breaks = seq(0, 160000, by = 20000)) +
  labs(
    title = "Median Income Distribution in the \n(Top 10 Counties)",
    x = "County",
    y = "Income",
    caption = "Source: Census Data") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
  ) +
  coord_flip()

Visualization 3: Population of Females by Top 10 States

# Filter for the 10 states with the highest average female population
top_10_female_population_states <- census_data |>
  group_by(new_county_state) |>
  summarize(avg_prop_female = mean(prop_female, na.rm = TRUE)) |>
  top_n(10, avg_prop_female)

# Filter the original data to include only these states
top_10_female_population_data <- census_data |>
  filter(new_county_state %in% top_10_female_population_states$new_county_state)

# Create a heatmap of the proportion of females by top 10 states using sample data
ggplot(top_10_female_population_data, aes(x = new_county_state, y = year, fill = round(prop_female * 100, 0))) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Proportion of Females (%)") +
  labs(
    title = "Population of Females by Top 10 States",
    x = "State",
    y = "Year",
    caption = "Source: Census Data"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    legend.text = element_text(size = 6),
    legend.title = element_text(size = 6),
    plot.title = element_text(size = 10),
    plot.caption = element_text(size = 8)
  )

Interpretation of the data: We expect that in Norton City, Virginia, in the year 2014, the population of females reached 63%, making it the highest among the top 10 states with the highest average female population. This suggests that you would most likely find a bride in Norton City, Virginia, during that year based on the highest female population proportion.

Population of Males by Top 10 States

# Filter for the 10 states with the highest average male population
top_10_male_population_states <- census_data |>
  group_by(new_county_state) |>
  summarize(avg_prop_male = mean(prop_male, na.rm = TRUE)) |>
  top_n(10, avg_prop_male)

# Filter the original data to include only these states
top_10_male_population_data <- census_data |>
  filter(new_county_state %in% top_10_male_population_states$new_county_state)

# Create a heatmap of the proportion of males by top 10 states using sample data
ggplot(top_10_male_population_data, aes(x = new_county_state, y = year, fill = round(prop_male * 100, 0))) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue", name = "Proportion of Males (%)") +
 
  labs(title = "Population of Males by Top 10 States ", x = "State", y = "Year",
caption = "Source: Cenus Data") +
    theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6),
    legend.text = element_text(size = 6),
    legend.title = element_text(size = 6),
    plot.title = element_text(size = 10),
    plot.caption = element_text(size = 8)
  )

Interpretation of the data: In 2017, Forest, Pennsylvania had the highest population of males, reaching up to 80%. This makes it the state and year where you would most likely find a husband based on the highest male population.

Visualization 4: Population in Poverty Trend

lowest_10_poverty_counties <- census_data_sample |>
  group_by(new_county_state) |>
  summarize(avg_prop_poverty = mean(prop_poverty, na.rm = TRUE)) |>
  top_n(-10, avg_prop_poverty)

# Filter the original data to include only these counties
lowest_10_poverty_data <- census_data_sample |>
  filter(new_county_state %in% lowest_10_poverty_counties$new_county_state)

# Plot the trend of poverty rates over time for the lowest 10 counties
ggplot(lowest_10_poverty_data, aes(x = year, y = prop_poverty * 100, color = new_county_state, group = new_county_state)) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  scale_y_continuous(labels = function(x) paste0(x, "%")) +
  labs(
    title = "Trend of Poverty Rates Over Time for the \n10 Counties with the Lowest Average Poverty Rates",
    x = "Year",
    y = "Population in Poverty",
    color = "County",
    caption = "Source: Census Data Sample"
  )+
  theme(axis.text.y = element_text(size = 7),
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Visualization 5: Interactive comparison of Median income level with Median Rent and Mortgage Costs

# Convert 'year' to numeric
census_data <- census_data |> mutate(year = as.numeric(year))

# Summary of median income, rent cost, and home cost
summary_data <- census_data |>
  group_by(year) |>
  summarise(
     mean_income = round(mean(median_income, na.rm = TRUE), 2),
    mean_rent_cost = round(mean(median_monthly_rent_cost, na.rm = TRUE), 2),
    mean_home_cost = round(mean(median_monthly_home_cost, na.rm = TRUE), 2)
  )

# Melt the data for faceting
summary_data_long <- summary_data |>
  pivot_longer(cols = c(mean_income, mean_rent_cost, mean_home_cost),
               names_to = "variable",
               values_to = "value")

# Recode the variable names
summary_data_long <- summary_data_long |>
  mutate(variable = recode(variable,
                           mean_income = "Income",
                           mean_rent_cost = "Rent",
                           mean_home_cost = "Mortgage"))

# Plot the trends
comparison_plot <- ggplot(summary_data_long, aes(x = year, y = value, color = variable)) +
  geom_line(size = 1.2) +
  scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "", big.mark = ",", accuracy = 0.01)) +
   facet_wrap(~ variable, scales = "free_y", ncol = 1) +
  labs(
    title = 'Comparison of Income, Rent and Mortgage Cost \n(2009-2021)',
    x = 'Year',
    y = 'Amount',
    caption = "Data Source: United States Census Data",
    color = "Variable"
  )
# Convert ggplot to plotly
ggplotly(comparison_plot)

Beeswarm Plot

# Filter counties with median income between 120,000 and 160,000
income_filtered <- census_data|>
  filter(median_income >= 120000 & median_income <= 160000)

# beeswarm plot
ggplot(income_filtered, aes(x = reorder(new_county_state, -median_income), y = median_income)) +
  geom_quasirandom(alpha = 0.6, color = "blue") +
  labs(title = "Beeswarm Plot of Median Income by County\n(120,000 and 160,000)",
       x = "County",
       y = "Income",
caption = "Source: Cenus Data") +
  scale_y_continuous(labels = dollar_format(prefix = "$", suffix = "", big.mark = ","), breaks = seq(120000, 160000, by = 15000)) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
  ) +
  guides(color=guide_legend(title="County"))

Permutation Testing

# Permutation test for difference in median income between two counties
county1 <- "Antrim County, Michigan"
county2 <- "Wayne County, Michigan"

# Define the function to compute income difference
income_diff <- function(data, county1, county2) {
  mean(data[data$county_state == county1, ]$median_income, na.rm = TRUE) - 
  mean(data[data$county_state == county2, ]$median_income, na.rm = TRUE)
}

# Calculate the observed difference
observed_diff <- income_diff(census_data, county1, county2)

# Generate permuted differences
permuted_diffs <- replicate(1000, {
  permuted_data <- census_data
  permuted_data$county_state <- sample(permuted_data$county_state)
  income_diff(permuted_data, county1, county2)
})

# Convert the permuted differences to a data frame
permuted_diffs_df <- data.frame(permuted_diffs = permuted_diffs)

# Plot the permuted differences
ggplot(permuted_diffs_df, aes(x = permuted_diffs)) +
  geom_histogram(binwidth = 500, fill = "blue", alpha = 0.7, color = "black") +
  geom_vline(aes(xintercept = observed_diff), color = "red", linetype = "dashed", size = 1) +
  labs(title = "Permutation Test for Median Income Difference", x = "Income Difference", y = "Frequency",
caption = "Source: Cenus Data") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
  ) 

Bootstrap

# Set seed for reproducibility
set.seed(1971)

# Number of bootstrap samples
B <- 1000

# Instantiating matrix for bootstrap samples
bootstrap_samples <- matrix(NA, nrow = nrow(census_data_sample), ncol = B)

# Sampling with replacement B times
for (b in 1:B) {
  bootstrap_samples[, b] <- census_data_sample |>
    slice_sample(prop = 1, replace = TRUE) |>
    dplyr::pull(median_income)
}

# Calculate bootstrap medians
bootstrap_medians <- apply(bootstrap_samples, 2, median, na.rm = TRUE)

# Calculate bootstrap confidence interval
boot_ci <- quantile(bootstrap_medians, probs = c(0.025, 0.975))

# Create a histogram of the bootstrap medians
ggplot(data.frame(medians = bootstrap_medians), aes(x = round(medians, 0))) +
  geom_histogram(binwidth = 500) +
  geom_vline(aes(xintercept = round(boot_ci[1], 0)), color = "red") +
  geom_vline(aes(xintercept = round(boot_ci[2], 0)), color = "red") +
  labs(
    title = "Bootstrap Confidence Interval for Median Income",
    x = "Income",
    y = "Frequency",
    caption = "Source: Census Data Sample"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7)
  )

Data Dictionary

# Display data dictionary
data_dictionary <- tibble(
  variable = c("geoid", "county_state", "year", "population", "median_income", 
               "median_monthly_rent_cost", "median_monthly_home_cost", "prop_female", 
               "prop_male", "prop_poverty"),
  description = c("Geographic region ID", "Geographic region", "Year", "Population", 
                  "Median income in dollars", "Median monthly housing costs for homeowners in dollars", 
                  "Median monthly rent costs for renters in dollars", "Proportion of people who are female", 
                  "Proportion of people who are male", "Proportion of people living in poverty")
)

knitr::kable(data_dictionary, caption = "Data Dictionary for Census Data")
Data Dictionary for Census Data
variable description
geoid Geographic region ID
county_state Geographic region
year Year
population Population
median_income Median income in dollars
median_monthly_rent_cost Median monthly housing costs for homeowners in dollars
median_monthly_home_cost Median monthly rent costs for renters in dollars
prop_female Proportion of people who are female
prop_male Proportion of people who are male
prop_poverty Proportion of people living in poverty

Extra Credit Population Trends

# Filter data for a sample county
sample_county <- census_data |> dplyr::filter(new_county_state == 'Autauga, Alabama')

# Check the structure of sample_county to ensure it's correct
str(sample_county)
## 'data.frame':    13 obs. of  11 variables:
##  $ geoid                   : int  1001 1001 1001 1001 1001 1001 1001 1001 1001 1001 ...
##  $ county_state            : chr  "Autauga County, Alabama" "Autauga County, Alabama" "Autauga County, Alabama" "Autauga County, Alabama" ...
##  $ year                    : num  2009 2010 2011 2012 2013 ...
##  $ population              : int  49584 53155 53944 54590 54907 55136 55221 55049 55036 55200 ...
##  $ median_income           : int  51463 53255 53899 53773 53682 52475 51281 53099 55317 58786 ...
##  $ median_monthly_rent_cost: int  779 769 832 836 851 876 883 896 932 966 ...
##  $ median_monthly_home_cost: int  854 889 912 918 887 875 878 872 853 877 ...
##  $ prop_female             : num  0.515 0.515 0.515 0.514 0.512 ...
##  $ prop_male               : num  0.485 0.485 0.485 0.486 0.488 ...
##  $ prop_poverty            : num  0.103 0.106 0.109 0.116 0.121 ...
##  $ new_county_state        : chr  "Autauga, Alabama" "Autauga, Alabama" "Autauga, Alabama" "Autauga, Alabama" ...
# Plot population trend
plot <- ggplot(sample_county, aes(x = year, y = population)) +
  geom_line(size = 1.2, color = "blue") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = 'Population Trend in Autauga County, Alabama',
    x = 'Year',
    y = 'Population',
    caption = "Data Source: Census Data"
  )

# Convert ggplot to plotly
ggplotly(plot)

Median Income Summary

# Convert 'year' back to numeric
census_data <- census_data |> mutate(year = as.numeric(year))

# Summary of median income
income_summary <- census_data |>
  group_by(year) |>
  summarise(mean_income = mean(median_income, na.rm = TRUE))

# Check the summary
print(income_summary)
## # A tibble: 13 × 2
##     year mean_income
##    <dbl>       <dbl>
##  1  2009      42818.
##  2  2010      43613.
##  3  2011      44620.
##  4  2012      44973.
##  5  2013      45265.
##  6  2014      45857.
##  7  2015      46130.
##  8  2016      47252.
##  9  2017      48995.
## 10  2018      50792.
## 11  2019      52648.
## 12  2020      54172.
## 13  2021      57327.
# Plot median income trend
income_plot <- ggplot(income_summary, aes(x = year, y = mean_income)) +
  geom_line(size = 1.2, color="blue") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = 'Median Income Trend (2009-2021)',
    x = 'Year',
    y = 'Median Income (in USD)',
    caption = "Data Source: United States Census Data"
  )

# Convert ggplot to plotly
ggplotly(income_plot)

Median Monthly Rent Cost Summary

# Summary of median monthly rent cost
rent_summary <- census_data |>
  group_by(year) |>
  summarise(mean_rent = mean(median_monthly_rent_cost, na.rm = TRUE))

# Plot median monthly rent cost trend
rent_plot <- ggplot(rent_summary, aes(x = year, y = mean_rent)) +
  geom_line(size = 1.2, color="blue") +
  labs(
    title = 'Median Monthly Rent Cost Trend (2009-2021)',
    x = 'Year',
    y = 'Monthly Rent',
    caption = "Data Source: Census Data")

# Convert ggplot to plotly
    ggplotly(rent_plot)

Key Summary Table

# Summary table of key statistics
key_summary <- census_data |>
  group_by(year) |>
  summarise(
    mean_population = mean(population, na.rm = TRUE),
    mean_income = mean(median_income, na.rm = TRUE),
    mean_rent = mean(median_monthly_rent_cost, na.rm = TRUE),
    mean_home_cost = mean(median_monthly_home_cost, na.rm = TRUE)
  ) |>
  mutate(
    mean_population = round(mean_population),
    mean_income = round(mean_income),
    mean_rent = round(mean_rent),
    mean_home_cost = round(mean_home_cost)
  )

# Display the summary table
datatable(key_summary, options = list(pageLength = 10)) |>
  formatCurrency(columns = c('mean_income', 'mean_rent', 'mean_home_cost'), currency = '', interval = 3, mark = ',') |>
  formatRound(columns = 'mean_population', interval = 3, mark = ',', digits = 0)

Key Findings:

The project offers a thorough analysis and visual representations that provide valuable insights into socio-economic trends across various counties in the United States during the period under examination.